Let’s load the data and look at it’s dimensions and structure:
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## [1] 506 14
So it appears, that the data has information on 14 variables from 506 individual observations.
library(ggplot2)
library(GGally)
ggpairs(Boston[,!colnames(Boston) %in% c('chas', 'rad')])
# Lets print the summaries without chas and ras which were classes.
summary(Boston[,!colnames(Boston) %in% c('chas', 'rad')])
## crim zn indus nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.3850
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis tax
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. :187.0
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.:279.0
## Median :6.208 Median : 77.50 Median : 3.207 Median :330.0
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean :408.2
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:666.0
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :711.0
## ptratio black lstat medv
## Min. :12.60 Min. : 0.32 Min. : 1.73 Min. : 5.00
## 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95 1st Qu.:17.02
## Median :19.05 Median :391.44 Median :11.36 Median :21.20
## Mean :18.46 Mean :356.67 Mean :12.65 Mean :22.53
## 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :22.00 Max. :396.90 Max. :37.97 Max. :50.00
table(Boston$chas)
##
## 0 1
## 471 35
table(Boston$rad)
##
## 1 2 3 4 5 6 7 8 24
## 20 24 38 110 115 26 17 24 132
Jänniä yhteyksiä.
boston_s <- as.data.frame(scale(Boston))
summary(boston_s)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
boston_s$crime <- cut(boston_s$crim, breaks = quantile(boston_s$crim), include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_s <- dplyr::select(boston_s, -crim)
selection <- sample(x = 1:nrow(boston_s), size = round(0.8*nrow(boston_s)))
train <- boston_s[selection, ]
test <- boston_s[-selection, ]
lda.fit <- lda(crime ~ ., data = train)
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
# target classes as numeric
classes <- as.numeric(train$crime)
# plot the lda results
plot(lda.fit, dimen = 2, col = classes , pch = classes)
lda.arrows(lda.fit, myscale = 1)
crimecat_actual <- test$crime
test <- dplyr::select(test, -crime)
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results
table(correct = crimecat_actual, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 17 6 0 0
## med_low 5 17 9 0
## med_high 2 5 18 1
## high 0 0 0 21
jännä
# k-means clustering
set.seed(777)
bst <- as.data.frame(scale(Boston))
km_5 <- kmeans(bst, centers = 5)
lda.fit2 <- lda(km_5$cluster ~ ., data = bst)
# target classes as numeric
class <- as.numeric(km_5$cluster)
# plot the lda results
plot(lda.fit2, dimen = 2, col = class , pch = class)
lda.arrows(lda.fit2, myscale = 1)
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 405 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
plotly::plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime)
#plotly::plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km$cluster)